import scrapy
from scrapy import Request
from poetry.items import CipaiItem, ArticleItemLoader,PoetryItem
from scrapy.loader import ItemLoader

# Root URL (scheme and host) of the shicimingju.com site.
base_url = 'https://www.shicimingju.com'

class CipaiSpider(scrapy.Spider):
    """Spider that walks the cipai index of shicimingju.com down to each poem.

    Callback chain: ``parse`` (cipai index) -> ``parse2`` (poems listed under
    one cipai) -> ``parse_detail`` (a single poem page).
    """

    name = 'cipai'
    allowed_domains = ['www.shicimingju.com']
    start_urls = ['https://www.shicimingju.com/cipai/index.html']

    def start_requests(self):
        """Yield the start requests with redirect following switched off.

        ``dont_redirect: True`` disables redirects for the request, and
        ``handle_httpstatus_list`` names the response codes this request is
        allowed to hand to its callback (301 and 302 here).
        """
        for start_url in self.start_urls:
            yield Request(
                start_url,
                meta={
                    'dont_redirect': True,
                    'handle_httpstatus_list': [301, 302],
                },
                callback=self.parse,
            )

    def parse(self, response):
        """Parse the cipai list page.

        Yields one ``CipaiItem`` loaded with the text and href of every link
        in the ``#main_left`` lists, followed by one request per link that is
        handled by ``parse2``.
        """
        title_xpath = '//*[@id="main_left"]//ul/li/a/text()'
        url_xpath = '//*[@id="main_left"]//ul/li/a/@href'

        item_loader = ItemLoader(item=CipaiItem(), response=response)
        item_loader.add_xpath('title', title_xpath)
        item_loader.add_xpath('url', url_xpath)
        yield item_loader.load_item()

        for post_url in response.xpath(url_xpath).extract():
            # urljoin returns absolute hrefs as-is and resolves relative ones
            # against the current page, replacing the manual 'http' check.
            # The raw href is still what gets forwarded in meta.
            yield Request(
                response.urljoin(post_url),
                callback=self.parse2,
                meta={"url": post_url},
            )

    def parse2(self, response):
        """Parse the list of poems under one cipai.

        Yields a request handled by ``parse_detail`` for every poem link
        whose href ends in ``html``; other hrefs are skipped.
        """
        hrefs = response.xpath(
            '//*[@id="main_left"]//div[@class="shici_list_main"]/h3/a/@href'
        ).extract()
        for post_url in hrefs:
            if not post_url.endswith('html'):
                continue
            # Blindly prefixing the host would corrupt an href that is
            # already absolute; urljoin handles both forms.
            yield Request(
                response.urljoin(post_url),
                callback=self.parse_detail,
                meta={"url": post_url},
            )

    def parse_detail(self, response):
        """Parse a poem detail page and yield a single ``PoetryItem``.

        The ``url`` field is the value passed along in the request meta
        (empty string when absent). ``images`` is only set when the page has
        an ``<img>`` directly under ``#item_div``.
        """
        item_loader = ArticleItemLoader(item=PoetryItem(), response=response)

        # string(.) flattens each node to its full text, nested tags included.
        title = response.xpath('//*[@id="zs_title"]').xpath('string(.)').extract()
        content = response.xpath('//*[@id="zs_content"]').xpath('string(.)').extract()
        annotation = response.xpath('//*[@id="item_shangxi"]').xpath('string(.)').extract()
        tag = response.xpath('//*[@class="shici-mark"]/a/text()').extract()
        image_src = response.xpath('//*[@id="item_div"]/img/@src').extract_first()

        item_loader.add_value('url', response.meta.get("url", ""))
        item_loader.add_value('title', title)
        item_loader.add_xpath('dynasty', '//div[@class="niandai_zuozhe"]/text()')
        item_loader.add_xpath('author', '//div[@class="niandai_zuozhe"]/a/text()')
        item_loader.add_value('content', content)
        item_loader.add_value('annotation', annotation)
        item_loader.add_value('tag', tag)
        item_loader.add_xpath('relation', '//*[@class="shici_list_main"]/h3/a/text()')
        if image_src:
            # Resolve against the page URL so absolute and protocol-relative
            # src values are not mangled by a plain host prefix.
            item_loader.add_value('images', response.urljoin(image_src))
        yield item_loader.load_item()